## Import libraries
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.switch_backend('Agg')
import glob
import os
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import seaborn as sns
import subprocess
from sklearn.manifold import TSNE
import plotly.express as px
from umap import UMAP
from sklearn import datasets
### Detect HPC or not
import socket
HOST = socket.gethostname()
## Set the working directory
if HOST == 'Gandalf':
os.chdir("/Users/jilke/Scripts/RRBSonCUPs/vsc42927")
else:
os.chdir("/scratch/gent/vo/000/gvo00027/")
## I/O files
refs = "./reference/cfRRBSreferences/new-clusters_test/2022-06-23_FFPE-ref-samples_clustered_DSS-DMRs-17T.csv"
ref_metadata = "./reference/cfRRBSreferences/new-clusters_test/2022-06-23_FFPE-ref-metadata_clusters_DSS-DMRs-17T.csv"
refList = "./sample_lists/refList_age_v2.csv"
outputfolder = "./projects/2020-05_variance_analysis/tSNE"
UMAPoutputfolder = "./projects/2020-05_variance_analysis/UMAP"
## determine minimal coverage
filter = 30
## apply coverage or count cut-off
samples_df = pd.read_csv(refs, index_col='clusterID')
samples_df.sort_values(by='clusterID', inplace=True)
# calculate the amount of clusters in each sample
cluster_count = pd.DataFrame()
for sample in list(samples_df):
cluster_count.loc[sample,'total'] = len(samples_df)-samples_df[sample].isna().sum()
# remove clusters with low coverage/count and calculate amount of remaining clusters
metadata_df = pd.read_csv(ref_metadata, index_col='clusterID')
filter_df = metadata_df.filter(regex='depth') # if NGS: regex='depth', if array: regex='count'
filter_df.rename(columns = lambda x: x.rstrip(" depth"), inplace=True) # if TCGA or own samples: x.split(' ')[0]
for sample in list(samples_df):
samples_df[sample] = samples_df[sample].mask(filter_df[sample] < filter)
cluster_count.loc[sample,'remaining'] = len(samples_df)-samples_df[sample].isna().sum()
cluster_count.loc[sample,'filtered'] = cluster_count.loc[sample,'total']-cluster_count.loc[sample,'remaining']
cluster_count['sampleID'] = cluster_count.index.values
samples_df.dropna(inplace=True)
## rename samples and remove unused refs
names_df = pd.read_csv(refList, sep=',',index_col=[0],usecols=['smapleID','name'])
samples_df = samples_df.transpose()
samples_df = pd.merge(samples_df,names_df,how='left',left_index=True,right_index=True)
samples_df = samples_df.set_index('name')
samples_df = samples_df.transpose()
samples_df.index.names = ['clusterID']
samples_df = samples_df[samples_df.columns.dropna()]
## remove unused refs from count df
cluster_count = pd.merge(cluster_count,names_df,how='left',left_index=True,right_index=True)
cluster_count.dropna(inplace=True)
cluster_count = cluster_count[~cluster_count['name'].str.contains('ESADy')]
## drop not-used refs
samples_df.drop(columns=list(samples_df.filter(regex='ESADy')),inplace=True)
Primary tumor samples (n=80) were collected that were less than 3 years old, preferably low-grade, with multiple blocks available and a clear-cut diagnosis. Tumor tissue from patients who received neoadjuvant therapy was excluded. From each of the following tumor entities (n=16), 5 samples were collected: lobular (BRCAl) and ductal (BRCA) breast cancer (invasive carcinoma NST), cholangiocarcinoma (CHOL), pancreatic ductal carcinoma (PAAD), colorectal (COAD), gastro-esophageal junction (GEJC) and lung adenocarcinoma (LUAD), cutaneous (CSCC), esophageal (ESCC) and lung squamous cell carcinoma (LUSC), diffuse large B-cell lymphoma (DLBL), mesothelioma (MESO), ovarian cancer (high-grade serous carcinoma) (OVCA), small-cell lung carcinoma (SCLC),, prostatic carcinoma (acinar type) (PRAD) and melanoma (SKCM). The estimated tumor percentage range is 10-90% (average 65%).
Because both tissue and blood contain a significant amount of DNA from white blood cells, we collected blood from healthy, volunteering co-workers at our lab to use as references (n=46). As shown in the density plot below, the volunteers had an unequal distribution in sex and age.
HD <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/projects/2021-02_additional-DMRselection/compare_DMRs/output/healthy_donors.csv')
HD_n <- HD %>%
group_by(birth.year, sex) %>%
summarise(n = n())
#HD_n$birth.year <- factor(HD_n$birth.year, levels = HD_n$birth.year[order(HD_n$birth.year)])
ggplot(HD_n, aes(x = birth.year, fill = sex, color = sex)) +
geom_density(alpha=.2)
To reduce any age or sex bias, we divided the range of birth years (1960-1995) in 3 bins of 12 years with 5 samples per bin (both M and F):
This selection gives a total of 15 samples that can be used as a (balanced) healthy reference dataset.
Unsupervised clustering analysis shows that much CpG regions covered by cfRRBS (min. 30x coverage) have a similar methylation percentage across different (tumor) entities, resulting in suboptimal cluster formation (supplementary figures 1-3).
# input
samples <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/clustered_samples/2022-07-29_FFPE-refs_samples_clusteredV3_cov30x.csv", check.names = FALSE, row.names = 1)
# transpose
samples_transposed <- t(samples)
# hierarchical clustering
dist_refs <- dist(samples_transposed, method = "manhattan")
cluster <- hclust(dist_refs, method = "ward.D2")
plot(cluster, hang = -1)
# remove NA values and transpose
samples <- na.omit(samples)
samples_transposed <- t(samples)
# convert matrix to dataframe and seperate labels
samples_t <- as.data.frame(samples_transposed)
samples_t[c('entity','no')] <- str_split_fixed(row.names(samples_t), " ", 2)
samples_t.labels <- samples_t[c('entity','no')]
samples_t <- subset(samples_t, select = -c(entity,no))
# UMAP
samples_t.umap <- umap(samples_t)
samples.umap <- samples_t.umap$layout %>% as.data.frame() %>% rename(UMAP1="V1",UMAP2="V2") %>% merge(samples_t.labels, by=0)
p <- ggplot(samples.umap, aes(x = UMAP1, y = UMAP2, color = entity)) +
geom_point() +
scale_color_manual(values = color_panel) +
labs(title = "UMAP", x = NULL, y = NULL)
ggplotly(p)
# t-SNE
samples_t.subs <- samples_t[,sample(ncol(samples_t), 15000)]
samples_t.tsne <- Rtsne(samples_t.subs)
samples_t.labels$ID <- 1:nrow(samples_t.labels)
samples.tsne <- samples_t.tsne$Y %>% as.data.frame() %>% mutate(ID=row_number()) %>%
rename(tSNE1="V1",tSNE2="V2") %>% merge(samples_t.labels, by="ID")
p <- ggplot(samples.tsne, aes(x = tSNE1, y = tSNE2, color = entity)) +
geom_point() +
scale_color_manual(values = color_panel) +
labs(title = "t-SNE, subsampled", x = NULL, y = NULL)
ggplotly(p)
# remove NA values and transpose
samples <- na.omit(samples)
samples_transposed <- t(samples)
# convert matrix to dataframe and seperate labels
samples_t <- as.data.frame(samples_transposed)
samples_t[c('entity','no')] <- str_split_fixed(row.names(samples_t), " ", 2)
samples_t.labels <- samples_t[c('entity','no')]
samples_t <- subset(samples_t, select = -c(entity,no))
# UMAP
samples_t.umap <- umap(samples_t)
samples.umap <- samples_t.umap$layout %>% as.data.frame() %>% rename(UMAP1="V1",UMAP2="V2") %>% merge(samples_t.labels, by=0)
ggplot(samples.umap, aes(x = UMAP1, y = UMAP2, color = entity)) +
geom_point() +
scale_color_manual(values = color_panel) +
labs(title = "UMAP", x = NULL, y = NULL)
# t-SNE
samples_t.subs <- samples_t[,sample(ncol(samples_t), 15000)]
samples_t.tsne <- Rtsne(samples_t.subs)
samples_t.labels$ID <- 1:nrow(samples_t.labels)
samples.tsne <- samples_t.tsne$Y %>% as.data.frame() %>% mutate(ID=row_number()) %>%
rename(tSNE1="V1",tSNE2="V2") %>% merge(samples_t.labels, by="ID")
ggplot(samples.tsne, aes(x = tSNE1, y = tSNE2, color = entity)) +
geom_point() +
scale_color_manual(values = color_panel) +
labs(title = "t-SNE, subsampled", x = NULL, y = NULL)
To select differentially methylated regions (DMRs) between (tumor) entities, we tested two software packages: DSS and DMRfinder. To be able to compare both, we only used DMRs from DMRfinder that had a p-value < 0.01 (because this was set as a cut-off in DSS) and we only used DMRs from DSS that had a ß-value difference > 0.1 (because this was set as a cut-off in DSS).
The bar chart below shows the average amount of DMRs found by each tool and how many of those DMRs are also found by the other tool. 63% of DMRs found by DSS are also found by DMRfinder, but only 21% of the DMRs found by DMRfinder are also found by DSS. We know from Liu et al. that DMRfinder has a much higher type I error rate compared to other tools.
DSSvsDMRf <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/projects/2021-02_additional-DMRselection/compare_DMRs/output/2022-07-28_DSSvsDMRfinder.csv')
#DMR_count <- subset(DSSvsDMRf, select = c(DSS, DSS_intersect, DMRf, DMRf_intersect, comparison))
#DMR_count <- melt(DMR_count, id='comparison')
#DMR_count <- rename(DMR_count, tool = variable)
#DMR_count$tool <- factor(DMR_count$tool, levels = c("DSS", "DSS_intersect", "DMRf", "DMRf_intersect"))
#DMR_count[c('t1','t2')] <- str_split_fixed(DMR_count$comparison, "-", 2)
# DMR amount
#p <- ggplot(DMR_count, aes(x = tool, y = value, group = comparison)) +
# geom_point(name = 'comparison') +
# geom_line() +
# labs(y = "DMR amount", title="amount of DMRs found by DSS and DMRfinder", col = NULL)
#ggplotly(p)
# DMR fractions (boxplot)
# DMR_fraction <- subset(DSSvsDMRf, select = c(DSS, DSS_intersect, DMRf, DMRf_intersect))
# DMR_fraction <- rename(DMR_fraction, DSS_total = DSS, DMRf_total = DMRf)
# DMR_fraction <- melt(DMR_fraction, value.name = 'amount')
# #DMR_fraction$variable <- factor(DMR_fraction$variable, levels = c("DSS_total", "DSS_intersect", "DMRf_total", "DMRf_intersect"))
# DMR_fraction[c('tool','fraction')] <- str_split_fixed(DMR_fraction$variable, "_", 2)
#
# ggplot(DMR_fraction, aes(x = tool, y = amount, fill = fraction)) +
# geom_boxplot() +
# labs(y = "DMR amount", title="amount of DMRs found by DSS and DMRfinder", col = NULL)
# DMR fractions (stacked bars)
DSSvsDMRf$DSS_unique <- DSSvsDMRf$DSS - DSSvsDMRf$DSS_intersect
DSSvsDMRf$DMRf_unique <- DSSvsDMRf$DMRf - DSSvsDMRf$DMRf_intersect
DMR_stacked <- data.frame(variable = c('DSS_unique', 'DSS_intersect', 'DMRfinder_unique', 'DMRfinder_intersect'),
value = c(mean(DSSvsDMRf$DSS_unique), mean(DSSvsDMRf$DSS_intersect), mean(DSSvsDMRf$DMRf_unique), mean(DSSvsDMRf$DMRf_intersect)))
DMR_stacked[c('tool','fraction')] <- str_split_fixed(DMR_stacked$variable, "_", 2)
ggplot(DMR_stacked, aes(fill=fraction, x=tool, y=value, label=fraction)) +
geom_bar(position = "stack", stat = "identity") +
labs(y = NULL, title="Fraction of common and unique DMRs", col = NULL, x = NULL)
We would expect a correlation between the amount of DMRs found by each tool. In case of a comparison between highly similar tumor entities such as pancreas (PAAD) and cholangiocarcinoma (CHOL) for example, we expect that both DSS and DMRfinder find fewer DMRs. Indeed, this seems to be the case in roughly half of comparisons, but in the other half, DMRfinder seems to find relatively much more DMRs compared to DSS. Each dot represents the comparison of two (tumor) entities:
# DMR amount correlation
DSSvsDMRf[c('t1','t2')] <- str_split_fixed(DSSvsDMRf$comparison, "-", 2)
p <- ggplot(DSSvsDMRf, aes(x = DSS, y = DMRf, group = comparison, colour = t1, fill = t2)) +
geom_point(name = 'comparison', size = 4, stroke = 1.5) +
scale_color_manual(values = color_panel) +
scale_fill_manual(values=color_panel) +
theme(legend.position='none') +
labs(title = "amount of DMRs per comparison", y = "DMRfinder")
ggplotly(p)
Finally, we compared the length of the DMRs found by DSS and DMRfinder. As expected, the mean length of DSS-DMRs is greater than DMRfinder-DMRs, because DMRfinder has a default setting where the maximum DMR length is limited to 500bp. However, the median DMR length of both tools is very similar.
# DMR length
DMR_length <- subset(DSSvsDMRf, select = c(DSS_mean.length, DSS_median.length, DMRf_mean.length, DMRf_median.length))
DMR_length <- rename(DMR_length, DSS_mean = DSS_mean.length, DSS_median = DSS_median.length, DMRfinder_mean = DMRf_mean.length, DMRfinder_median = DMRf_median.length)
DMR_length <- melt(DMR_length, value.name = 'length')
DMR_length[c('tool','summary')] <- str_split_fixed(DMR_length$variable, "_", 2)
ggplot(DMR_length, aes(x = summary, y = length, fill = tool)) +
geom_boxplot() +
labs(y = "DMR length", title="length of DMRs", col = NULL, x = NULL)
Because the classifier performed worse if based on DMRs selected by DMRfinder compared to DSS, we decided to continue with DSS.
For each (tumor) entity, 200 ‘defining’ DMRs are selected. These are DMRs that can differentiate a (tumor) entity from all other entities (n=16). If there are less than 200 defining DMRs available for a certain entity, DMRs are selected that can differentiate this entity from all entities except one (n = 15). If the sum of both n16-DMRs and n15-DMRs is still less than 200, DMRs are selected that differentiate from 14 (tumor) entities, and so on.
The stacked bar plot below shows the DMR composition per tumor entity. Darker blue means the selected DMRs can distinguish this entity from fewer other entities. The amount of DMRs is depicted on the bars.
count <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/projects/2021-02_additional-DMRselection/compare_DMRs/output/2022-08-02_DSS-17T-count.csv')
count <- rename(count, type_count = X)
count <- melt(count, id='type_count')
count <- count[order(-count$type_count),]
count <- rename(count, tumor_type = variable)
ggplot(count, aes(fill=type_count, x=tumor_type, y=value, label=value)) +
geom_bar(position = "stack", stat = "identity") +
geom_text(position = position_stack(vjust = 0.5)) +
labs(fill="no of other\nentities", x = NULL, y = NULL, title = "DMR count per entity (total = 200)") +
theme(panel.background = element_rect(fill = 'white', colour = 'white'),
axis.text.x=element_text(angle = 45),
axis.ticks.x=element_blank())
# same plot with different color for each entity
# ggplot(count, aes(fill=tumor_type, x=tumor_type, y=value, label=value)) +
# geom_bar(position = "stack", stat = "identity", aes(alpha=type_count)) +
# scale_alpha(range = c(0.1,1)) +
# geom_text(position = position_stack(vjust = 0.5)) +
# scale_fill_manual(values=color_panel) +
# labs(fill="no of entities", x = NULL, y = NULL) +
# theme(panel.background = element_rect(fill = 'white', colour = 'white'),
# axis.text.x=element_text(angle = 45),
# axis.ticks.x=element_blank())
Most entities largely consist of DMRs that cannot differentiate against all other entities. Cholangiocarcinoma (CHOL) for example, only has 3 defining n16-DMRs. Pancreas adenocarcinomas (PAAD) and lung squamous cell carcinomas (LUSC) have no defining DMRs at all. To give a counterexample, small cell lung cancer (SCLC) has at least 200 defining DMRs.
If we look at the n15-DMRs (all entities except one), we were interested to know which tumor entities are most frequently missing from these DMRs. To further clarify this, the table below gives an example in the case of ductal breast carcinoma (BRCA). For this tumor entity, all n15-DMRs on chromosome 1 (6 in total) are shown, meaning that these regions are differentially methylated in BRCA compared to all other entities, except one. Each column contains the ß-value difference between BRCA and another entity for those specific regions. Note that to be considered a ‘true’ DMR, the region must be the highest or the lowest methylated in BRCA. In other words, the difference must be always negative or always positive in each comparison.
brca_ex <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/projects/2021-02_additional-DMRselection/compare_DMRs/output/2022-08-03_BRCA_missingDMRs-example.csv', check.names = FALSE)
DT::datatable(brca_ex, rownames = FALSE, caption = "BRCA n15-DMRs", options = list(pageLength = 10, scrollX=T)) %>%
DT::formatStyle(c('chr','start','end'), backgroundColor = 'lightgrey') %>%
DT::formatRound(columns = colnames(brca_ex)[-c(1:3)] ,digits = 2) %>%
DT::formatStyle(colnames(brca_ex)[-c(1:3)], backgroundColor = DT::styleEqual(0, 'lightblue'))
brca_ex <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/projects/2021-02_additional-DMRselection/compare_DMRs/output/2022-08-03_BRCA_missingDMRs-example.csv', check.names = FALSE)
kable(brca_ex, "latex", booktabs = T) %>%
kable_styling(latex_options = c("scale_down", "hold_position"))
#brca_ex <- kable(brca_ex)
#brca_ex %>% kable_styling(latex_options = "scale_down")
In this example, BRCA is always hypermethylated in 4 DMRs and always hypomethylated in 2 DMRs compared to the other entities. But, in 2 out of 6 DMRs (33%), diffuse large B-cell lymphoma (DLBL) is not differentially methylated from BRCA. In 4 out of 6 regions (67%), lobular breast carcinoma (BRCAl) is not differentially methylated from BRCA.
For all DMRs and for each (tumor) entity, we can thus calculate the percentage of regions where they are not differentially methylated from BRCA, while others are. In the heatmap below, you can see that in the case of BRCA (bottom row), BRCAl is not differentially methylated in 42% of the DMRs.
DMRs <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/projects/2021-02_additional-DMRselection/compare_DMRs/output/2022-08-02_DSS-17T_true-defDMRs_pct_leave1out.csv')
DMRs[c('entity','amount')] <- str_split_fixed(DMRs$X, "_", 2)
DMRs <- subset(DMRs,select = -c(X))
DMRs_m <- melt(DMRs, id=c('entity','amount'))
DMRs_m <- rename(DMRs_m, percentage = value)
DMRs_m$variable <- as.character(DMRs_m$variable)
DMRs_m <- DMRs_m[order(DMRs_m$entity, DMRs_m$variable),]
# heatmap
ggplot(filter(DMRs_m, amount == 15), aes(x = variable, y = entity, fill = percentage)) +
geom_tile() +
coord_fixed() +
geom_text(aes(label = round(percentage)), color = "white", size = 4) +
labs(x = "missing (tumor) entity", y = NULL, title="Percentage of each entity missing from n15-DMRs", col = NULL)
Note that PAAD is not included in this plot, because PAAD has no n15-DMRs. For other entities such as LUSC, the percentages might not be representative, as LUSC only has 4 n15-DMRs. We therefore repeated this calculation for n14-DMRs (supplementary figure 4).
ggplot(filter(DMRs_m, amount == 14), aes(x = variable, y = entity, fill = percentage)) +
geom_tile() +
coord_fixed() +
geom_text(aes(label = round(percentage)), color = "white", size = 4) +
labs(x = "missing (tumor) entity", y = NULL, title="Percentage of each entity missing from n14-DMRs", col = NULL)
Each row of the n14-DMRs heatmap is also visualized in the bar chart below (supplementary figure 5).
# bar chart
ggplot(filter(DMRs_m, amount == 14), aes(variable, percentage, fill = variable)) +
facet_wrap(~ entity) +
geom_bar(stat="identity", show.legend=FALSE) +
scale_fill_manual(values = color_panel) +
theme(axis.text.x = element_text(angle = -45, hjust = 0), axis.title.x = element_blank()) +
geom_hline(yintercept = 40, linetype = "dashed", color = "darkgrey") +
labs(title="Percentage of each entity missing from n14-DMRs")
We can assume that a higher percentage indicates a higher resemblance between entities, as this means these entities share more regions with similar ß-value. This might be problematic for the 200 DMRs we selected for each entity; to continue with the BRCA example, more than 70% of these DMRs cannot be used to differentiate between ductal and lobular breast carcinoma. It might therefore be better to group these entities together when used in a classifier with multiple other entities.
If we for example define an arbitrary cut-off at >40% and only allow grouping if high similarity is present in both directions (meaning that not only BRCAl is highly similar to BRCA, but BRCA is also highly similar to BRCAl compared to other entities), we can see that besides a BRCA-BRCAl group, we could also group colon and gastro-esophageal junction tumors (COAD-GEJC) and cutaneous, esophageal and lung squamous cell carcinomas (CSCC-ESCC-LUSC). These groupings also make sense from a histopathological viewpoint.
To further explore a possible grouping of (tumor) references, we performed a new unsupervised clustering using only the selected 200 DMRs per entity.
# input
samples <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/clustered_samples/2022-08-03_FFPE-refs_DSS-DMRs-17T_cov30x.csv", check.names = FALSE, row.names = 1)
samples_noNA <- na.omit(samples)
# transpose
samples_transposed <- t(samples_noNA)
# convert matrix to dataframe and seperate labels
samples_t <- as.data.frame(samples_transposed)
samples_t[c('entity','no')] <- str_split_fixed(row.names(samples_t), " ", 2)
samples_t.labels <- samples_t[c('entity','no')]
samples_t <- subset(samples_t, select = -c(entity,no))
# UMAP
samples_t.umap <- umap(samples_t)
samples.umap <- samples_t.umap$layout %>% as.data.frame() %>% rename(UMAP1="V1",UMAP2="V2") %>% merge(samples_t.labels, by=0)
p <- ggplot(samples.umap, aes(x = UMAP1, y = UMAP2, color = entity)) +
geom_point(size = 3) +
scale_color_manual(values = color_panel) +
labs(title = "UMAP", x = NULL, y = NULL)
ggplotly(p)
# t-SNE
samples_t.tsne <- Rtsne(samples_t)
samples_t.labels$ID <- 1:nrow(samples_t.labels)
samples.tsne <- samples_t.tsne$Y %>% as.data.frame() %>% mutate(ID=row_number()) %>%
rename(tSNE1="V1",tSNE2="V2") %>% merge(samples_t.labels, by="ID")
p <- ggplot(samples.tsne, aes(x = tSNE1, y = tSNE2, color = entity)) +
geom_point() +
scale_color_manual(values = color_panel) +
labs(title = "Figure 1: t-SNE", x = NULL, y = NULL)
ggplotly(p)
Both t-SNE and UMAP do not allow NA values in any row, so only roughly 1/3 of DMRs can be used in these clustering analyses. In hierarchical clustering all DMRs can be used.
# hierarchical clustering
samples_tNA <- t(samples)
dist_refs <- dist(samples_tNA, method = "manhattan")
cluster <- hclust(dist_refs, method = "ward.D2")
# calculate Dunn index
dunn.lst <- list()
for (i in 2:30) {
name <- as.character(i)
cutCluster <- cutree(cluster,i)
dunn.lst[[name]] <- dunn(dist_refs, cutCluster)
}
# plot dunn index
dunn.df <- do.call(cbind, dunn.lst)
dunn.df <- melt(dunn.df)
dunn.df <- rename(dunn.df, "amount" = "Var2")
ggplot(dunn.df, aes(x = amount, y = value)) +
geom_point() +
geom_line() +
labs(title = "Optimal number of clusters", subtitle = "Dunn Index: temporary maximum at 7-11 clusters", x = "cluster amount", y = "Dunn Index")
# Silhouette method
fviz_nbclust(samples_transposed, kmeans, method = "silhouette", k.max = 30) +
labs(subtitle = "Silhouette method: 14 clusters")
# plot hierarchical clustering
plot(cluster, hang = -1)
abline(h = 725, col = 'blue')
abline(h = 560, col = 'red')
#rect.hclust(cluster, k = 15, border = 2:6)
#ggdendrogram(cluster, rotate = TRUE)
# Gap statistic
# nboot = 50 to keep the function speedy.
#set.seed(123)
#fviz_nbclust(samples_transposed, kmeans, nstart = 25, method = "gap_stat", nboot = 50, k.max = 30) +
# labs(subtitle = "Gap statistic method: 19 clusters")
#ggsave("/Users/jilke/Scripts/RRBSonCUPs/projects/2022-06_final-scripts_paper/gap_stat_refs.png")
knitr::include_graphics("/Users/jilke/Scripts/RRBSonCUPs/projects/2022-06_final-scripts_paper/gap_stat_refs.png")
To determine the optimal number of groups, different methods can be used such as the Dunn index, silhouette method or gap statistic. However, all methods give different outcomes ranging from 7 to 19 clusters, which is not surprising given the relatively small sample size of highly variable tumor entities. We therefore took a pragmatic approach based on unsupervised clustering, n15/n14-ESR analysis and histopathological information.
The blue line on the dendrogram indicates the optimal groups. Although clustering has improved with DMR selection, there are still some entities that are grouped incorrectly.
Here we can also see that if we would want to keep BRCA and BRCAl as seperate groups in our classifier, this would also mean that entities such as OVCA, ESCC, SKCM etc. should be further subdivided (red line). In other words, the variance between BRCA and BRCAl is smaller than within an entity such as OVCA.
Grouping of the squamous cell carcinomas (CSCC-ESCC-LUSC) is also supported by unsupervised clustering, although LUSC seems to be very heterogeneous.
In all clustering analyses, COAD and GEJC are seen as clearly seperated groups, a gastro-intestinal group COAD-GEJC is therefore not desirable.
PAAD was thrown out as a reference, for two reasons:
An argument can be made to include a CHOL-GEJC group; they cluster together and GEJC is missing in > 40% of DMRs in CHOL.
# input
samples <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/reference/cfRRBSreferences/new-clusters_test/2022-06-23_FFPE-ref-samples_clustered_DSS-DMRs-17T.csv")
refList <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/sample_lists/refList_age_v2.csv")
# transpose and name columns
samples_transposed <- t(samples)
colnames(samples_transposed) <- head(samples_transposed,1)
samples_transposed <- samples_transposed[-1,]
samples_transposed <- merge(samples_transposed, refList, by.x = 0, by.y = "smapleID")
samples_transposed <- samples_transposed[,-1]
rownames(samples_transposed) <- samples_transposed$name
samples_transposed <- subset(samples_transposed,select = -c(name))
samples_transposed <- samples_transposed[!(row.names(samples_transposed) %in% c("ESADy 2","ESADy 5","ESADy 6")),]
# hierarchical clustering
dist_refs <- dist(samples_transposed, method = "manhattan")
cluster <- hclust(dist_refs, method = "ward.D2")
#plot(cluster, hang = -1)
## heatmap
samplesT_WOna <- samples_transposed[ , colSums(is.na(samples_transposed)) == 0]
samplesT_WOna <- as.matrix(samplesT_WOna)
heatmap.2(samplesT_WOna, trace="none",
hclustfun = function(dist_refs) hclust(dist_refs, method = "ward.D2"),
cexRow = 0.5, labCol = FALSE, col=brewer.pal(9,"Blues"), dendrogram = "row")
In conclusion, a classifier was built with 12 references, which includes 3 grouped entities:
All regions with less than 30x coverage are thrown out.
cluster_count <- as.data.frame(do.call(cbind, py$cluster_count))
cluster_count$total <- NULL
cluster_count <- melt(cluster_count, id=c('sampleID','name'))
cluster_count <- rename(cluster_count, clusters = variable)
cluster_count$value <- as.numeric(cluster_count$value)
cluster_count$clusters <- as.character(cluster_count$clusters)
cluster_count <- cluster_count[order(cluster_count$name, cluster_count$clusters),]
ggplot(cluster_count, aes(fill=clusters, x=value, y=name)) +
geom_bar(position = "stack", stat = "identity")
For each (grouped) entity, 200 ‘defining’ DMRs are selected. These are DMRs that can differentiate a (tumor) entity from all other entities (n=11). If there are less than 200 defining DMRs available for a certain entity, DMRs are selected that can differentiate this entity from all entities except one (n = 10). If the sum of both n11-DMRs and n10-DMRs is still less than 200, DMRs are selected that differentiate from 9 (tumor) entities, and so on.
The stacked bar plot below shows the DMR composition per group. Darker blue means the selected DMRs can distinguish this entity from fewer other entities. The amount of DMRs is depicted on the bars.
count <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/projects/2021-02_additional-DMRselection/compare_DMRs/output/2022-08-08_DSS-12G-count.csv')
count <- rename(count, type_count = X)
count <- melt(count, id='type_count')
count <- count[order(-count$type_count),]
count <- rename(count, tumor_type = variable)
ggplot(count, aes(fill=type_count, x=tumor_type, y=value, label=value)) +
geom_bar(position = "stack", stat = "identity") +
geom_text(position = position_stack(vjust = 0.5)) +
scale_fill_gradient2(midpoint = 8.5) +
labs(fill="no of other\nentities", x = NULL, y = NULL, title = "Figure 2: DMR count per (grouped) entity (total = 200)") +
theme(panel.background = element_rect(fill = 'white', colour = 'white'),
axis.text.x=element_text(angle = 45),
axis.ticks.x=element_blank())
For all DMRs and for each (grouped) entity, we can calculate the percentage of regions where they are not differentially methylated from a certain entity, while others are.
DMRs <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/projects/2021-02_additional-DMRselection/compare_DMRs/output/2022-08-08_DSS-12G_true-defDMRs_pct_leave1out.csv')
DMRs[c('entity','amount')] <- str_split_fixed(DMRs$X, "_", 2)
DMRs <- subset(DMRs,select = -c(X))
DMRs_m <- melt(DMRs, id=c('entity','amount'))
DMRs_m <- rename(DMRs_m, percentage = value)
DMRs_m$variable <- as.character(DMRs_m$variable)
DMRs_m <- DMRs_m[order(DMRs_m$entity, DMRs_m$variable),]
# heatmap
ggplot(filter(DMRs_m, amount == 10), aes(x = variable, y = entity, fill = percentage)) +
geom_tile() +
coord_fixed() +
geom_text(aes(label = round(percentage)), color = "white", size = 4) +
labs(x = "missing (tumor) entity", y = NULL, title="Percentage of each entity missing from n10-DMRs", col = NULL)
The percentages might not be representative, as for example HIGI only has 8 n10-DMRs. We therefore repeated this calculation for n9-DMRs.
ggplot(filter(DMRs_m, amount == 9), aes(x = variable, y = entity, fill = percentage)) +
geom_tile() +
coord_fixed() +
geom_text(aes(label = round(percentage)), color = "white", size = 4) +
labs(x = "missing (tumor) entity", y = NULL, title="Percentage of each entity missing from n9-DMRs", col = NULL)
Each row of the n9-DMRs heatmap is also visualized in the bar chart below.
# bar chart
ggplot(filter(DMRs_m, amount == 9), aes(variable, percentage, fill = variable)) +
facet_wrap(~ entity) +
geom_bar(stat="identity", show_guide=FALSE) +
scale_fill_manual(values = color_panel_12G) +
theme(axis.text.x = element_text(angle = -45, hjust = 0), axis.title.x = element_blank()) +
geom_hline(yintercept = 40, linetype = "dashed", color = "darkgrey") +
labs(title="Percentage of each entity missing from n9-DMRs")
More balanced DMRs, except for DLBL, which is missing in > 40% of DMRs in the majority (7/12) of entities. COAD-HIGI, just like COAD-GEJC before, are still highly similar.
We first tested our approach on a retrospective collection of FFPE tissue biopsies of clinical CUPS for which a diagnosis was available after multiple staining rounds. For the routine pathological work-up of these samples an average number of 9 IHC stainings were performed (range: 3 - 20), counting only stainings that predict tissue of origin, not therapeutic response. If possible, a maximum of 150 to 200ng of FFPE-DNA was used as input. However, for 8 cases only fine needle aspiration (FNA) or core needle biopsy (CNB) samples were available, resulting in a DNA input ranging from 13 to 81ng (supplementary table 3). As shown in figure 3, all retrospective samples were classified correctly (18/18). The lowest predicted tumor percentage was 35%.
Additionally, the classifier was tested on tissue samples collected prospectively from patients who also donated blood. The majority of samples (9/12) were FNA or CNB samples, with DNA input ranging from < 0.5 to 99.4ng (supplementary table 3). Here, 3 out of 12 samples were misclassified: two LUAD samples and one HIGI sample. In all 3 cases, methylation profiling of the blood ctDNA provided the correct diagnosis (results below).
# methatlas results
gen.results <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/methAtlas/new-clusters_test/2022-08-30_all-samples_30x_DSS-DMRs-12G_methAtlas_deconv_output_no-meso.csv", check.names=FALSE)
gen.results <- results_calc(gen.results)
# FFPE (retrospective) samples
retrosp.samples <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/sample_lists/sampleList_FFPE-CUPs.csv", sep = ';')
retrosp.samples <- rename(retrosp.samples, ID = smapleID)
# FFPE biopsy (prospective) samples
biopsy.samples <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/sample_lists/sampleList_biopsies.csv", sep = ';')
### get correct diagnosis from plasma sample table
plasma.samples <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/sample_lists/sampleList_plasma-CUPs.csv", sep = ';')
plasma.samples <- rename(plasma.samples, ID = smapleID)
biopsy.samples_merged <- merge(biopsy.samples, plasma.samples, on = "name")
biopsy.samples <- subset(biopsy.samples_merged, select = -c(ID))
biopsy.samples <- rename(biopsy.samples, ID = smapleID)
# bind prospective and retrospective FFPE samples
biopsy.samples$collection <- "prospective"
retrosp.samples$collection <- "retrospective"
retrosp.samples$CUP <- "yes"
FFPE.samples <- rbind(biopsy.samples, retrosp.samples)
### evaluate methatlas prediction
FFPE_gen.results <- eval_results(FFPE.samples, gen.results)
# methatlas classification results
ggplot(FFPE_gen.results, aes(x=eval, y=tumorpercentage*100, color = classification, shape = collection, stroke = 2)) + #size = ratio
geom_point(data = FFPE_gen.results[FFPE_gen.results$eval == "false",], size = 6) +
geom_quasirandom(data = FFPE_gen.results[FFPE_gen.results$eval == "correct",], width = 0.7, size = 6) +
scale_color_manual(values = color_panel_12G, drop = FALSE) +
scale_shape_manual(values = c(16,1)) +
scale_x_discrete(drop = FALSE) +
# scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
ylim(0,100) +
labs(x = NULL, y = "tumor percentage", title="Figure 3: classification results on FFPE tissue") +
theme_classic()
# not used anymore for plot
#geom_jitter(data = FFPE_gen.results[FFPE_gen.results$eval == "correct",], width = 0.3) +
#geom_dotplot(binaxis = 'y', stackdir = "center", method="dotdensity", stackgroups = T,binpositions="all", binwidth = 4)
# evaluate celfie prediction
#FFPE_gen.results_celfie <- eval_results(FFPE.samples, gen.results_celfie)
# celfie classification results
# ggplot(FFPE_gen.results_celfie, aes(x=eval, y=tumorpercentage*100, color = classification, size = ratio)) +
# geom_jitter(width = 0.3) +
# scale_color_manual(values = color_panel_12G) +
# scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
# ylim(0,100) +
# labs(x = NULL, y = "tumor percentage", title="CelFiE classification results: 13/19 correct")
# interactive plot to combine diagnosis and classification
# p <- ggplot(FFPE.samples, aes(x=eval, y=tumorpercentage, color = diagnosis, fill = classification, group = smapleID)) +
# geom_point(data = FFPE.samples[FFPE.samples$eval == "false",], size = 5, stroke = 1.5) +
# geom_jitter(data = FFPE.samples[FFPE.samples$eval == "correct",], size = 5, stroke = 1.5) +
# scale_color_manual(values = color_panel) +
# scale_fill_manual(values = color_panel_12G) +
# labs(x = NULL)
# ggplotly(p)
Subsequent classification for the HIGI samples could correctly subdivide 6/8 retrospective and 3/4 prospective samples into a gastro-esophageal junction (GEJC) or pancreatico-biliary (CHOL) origin (9/12). Both retrospective BRCAc samples were correctly classified as ductal breast carcinoma in the BRCAc-specific subclassification, but the prospective BRCAc sample was misclassified as lobular breast carcinoma (2/3). Subclassification of the two prospective SCC samples proved the most challenging. One sample is suspected to be of head and neck origin; the patient had a PET-positive focus at the hypopharynx, but this could not be verified with endoscopy. Although this specific malignancy is not in our classifier, it is not farfetched to take ESCC as the closest alternative. However, our classifier predicted LUSC, while the suspected LUSC sample was classified as ESCC. We have conservatively labeled both predictions as false (0/2), although verification is not possible.
# HIGI specific classification
higi.results <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/vsc42927/methAtlas/new-clusters_test/2022-09-12_all-samples_30x_DSS-DMRs-HIGI_methAtlas_deconv_output.csv', check.names=FALSE)
# calculate 1st and 2nd highest tumor fraction and ratio
higi.results <- results_calc(higi.results)
# select HIGI results from gen.results
FFPE_higi.samples <- FFPE_gen.results[(FFPE_gen.results$classification == "HIGI")&(FFPE_gen.results$eval == "correct"),]
FFPE_higi.results <- higi.results[which(higi.results$ID %in% FFPE_higi.samples$ID),]
# evaluate prediction
FFPE_higi.results <- eval_results(FFPE.samples, FFPE_higi.results)
# BRCAc specific classification
BRCAc.results <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/vsc42927/methAtlas/new-clusters_test/2022-09-12_all-samples_30x_DSS-DMRs-BRCAc_methAtlas_deconv_output.csv', check.names=FALSE)
# calculate 1st and 2nd highest tumor fraction and ratio
BRCAc.results <- results_calc(BRCAc.results)
# select BRCAc results from gen.results
FFPE_BRCAc.samples <- FFPE_gen.results[(FFPE_gen.results$classification == "BRCAc")&(FFPE_gen.results$eval == "correct"),]
FFPE_BRCAc.results <- BRCAc.results[which(BRCAc.results$ID %in% FFPE_BRCAc.samples$ID),]
# evaluate prediction
FFPE_BRCAc.results <- eval_results(FFPE.samples, FFPE_BRCAc.results)
# SCC specific classification
scc.results <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/vsc42927/methAtlas/new-clusters_test/2022-09-12_all-samples_30x_DSS-DMRs-SCC_methAtlas_deconv_output.csv', check.names=FALSE)
# calculate 1st and 2nd highest tumor fraction and ratio
scc.results <- results_calc(scc.results)
# select SCC results from gen.results
FFPE_scc.samples <- FFPE_gen.results[(FFPE_gen.results$classification == "SCC")&(FFPE_gen.results$eval == "correct"),]
FFPE_scc.results <- scc.results[which(scc.results$ID %in% FFPE_scc.samples$ID),]
# evaluate prediction
FFPE_scc.results <- eval_results(FFPE.samples, FFPE_scc.results)
# combine results
FFPE_sub.results <- rbind(FFPE_higi.results,FFPE_BRCAc.results,FFPE_scc.results)
# Subclassification results
ggplot(FFPE_sub.results, aes(x=eval, y=tumorpercentage*100, color = classification, size = ratio, shape = collection, stroke = 2)) +
geom_quasirandom(width = 0.3) +
scale_color_manual(values = color_panel_sub) +
scale_shape_manual(values = c(1,16)) +
scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
ylim(0,100) +
labs(x = NULL, y = "tumor percentage", title="Subclassification results (HIGI, BRCAc and SCC: 11/17 correct") +
theme_classic()
The predicted tumor percentage is the highest tumor fraction T1 in the deconvolution of a given sample with the methAtlas algorithm. The prediction ratio is calculated as 1 – T2/T1 with T2 being the second highest predicted tumor fraction. A logistic regression model fitted to the prediction ratio has an AUC of 0.886 (supplementary figure 7).
FFPE_higi.results$eval <- ifelse(FFPE_higi.results$eval == "false", "false subtype", "correct subtype")
FFPE_BRCAc.results$eval <- ifelse(FFPE_BRCAc.results$eval == "false", "false subtype", "correct subtype")
FFPE_scc.results$eval <- ifelse(FFPE_scc.results$eval == "false", "false subtype", "correct subtype")
FFPE.results <- rbind(FFPE_gen.results, FFPE_higi.results, FFPE_BRCAc.results, FFPE_scc.results)
FFPE.results$eval <- factor(FFPE.results$eval, levels = c("correct", "correct subtype", "false subtype", "false"))
FFPE.results$step <- ifelse(grepl("subtype", FFPE.results$eval), "subclassification", "general classification")
adjusted.color_panel <- c("#B2DF8A","#A65628","#A6CEE3","#F781BF","#FFFF33","#000000", "#FF7F00","#33A02C","#FB9A99","#FDBF6F", "#999999", "#A6761D", "#1F78B4", "#810F7C","#E7298A", "#CAB2D6", "#E41A1C")
names(adjusted.color_panel) <- c('HIGI','COAD','LUAD','BRCA','PRAD','SKCM','GEJC','CHOL','BRCAl','SCC','DLBL','ESCC','LUSC','MESO','OVCA','SCLC','healthy')
adjusted.color_panel <- adjusted.color_panel[sort(names(adjusted.color_panel))]
# generate a ROC curve for the prediction ratio
FFPE.results$ROC_label <- ifelse(FFPE.results$eval == "false" | FFPE.results$eval == "false subtype", 0, 1)
glm.fit <- glm(ROC_label ~ ratio, data = FFPE.results, family = binomial)
xweight <- seq(0, 10, 0.01)
yweight <- predict(glm.fit, list(ratio = xweight), type="response")
plot(x=FFPE.results$ratio, y=FFPE.results$ROC_label)
lines(xweight, yweight)
# make the ROC plot square
par(pty = "s")
# ROC plot
roc(FFPE.results$ROC_label, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, print.auc=TRUE, main = "Supplementary figure 7: ROC curve of the prediction ratio in FFPE samples")
##
## Call:
## roc.default(response = FFPE.results$ROC_label, predictor = glm.fit$fitted.values, plot = TRUE, legacy.axes = TRUE, print.auc = TRUE, main = "Supplementary figure 7: ROC curve of the prediction ratio in FFPE samples")
##
## Data: glm.fit$fitted.values in 9 controls (FFPE.results$ROC_label 0) < 38 cases (FFPE.results$ROC_label 1).
## Area under the curve: 0.886
We can observe that the prediction accuracy reaches 97% if the prediction ratio is 0.5 or more, as shown in figure 4. Although there are not enough samples with low predicted tumor percentage to define a cut-off, we can assume that this variable also correlates with prediction accuracy (Figure 3). Two samples with the lowest tumor percentage (below 16%) received a false diagnosis. By definition, the prediction ratio is dependent on the predicted tumor percentage (figure 5). Both are important factors to consider, which is illustrated by the two misclassified LUAD samples in the general classification (misclassified as HIGI and SCC). These 2 cases have a low prediction ratio (0.04 and 0.35 respectively), but a predicted tumor percentage above 16% (36% and 28% respectively). For both cases LUAD is the second highest predicted tumor fraction.
# FFPE.results_false <- FFPE_gen.results[FFPE_gen.results$eval == "false",]
# FFPE.results <- FFPE_gen.results[(FFPE_gen.results$classification != "HIGI") & (FFPE_gen.results$classification != "BRCAc") & (FFPE_gen.results$classification != "SCC") & (FFPE_gen.results$eval == "correct"),]
# FFPE_higi.results$eval <- ifelse(FFPE_higi.results$eval == "false", "false subtype", FFPE_higi.results$eval)
# FFPE_BRCAc.results$eval <- ifelse(FFPE_BRCAc.results$eval == "false", "false subtype", FFPE_BRCAc.results$eval)
# FFPE_scc.results$eval <- ifelse(FFPE_scc.results$eval == "false", "false subtype", FFPE_scc.results$eval)
# FFPE.results <- rbind(FFPE.results, FFPE.results_false, FFPE_higi.results, FFPE_BRCAc.results, FFPE_scc.results)
#
# ggplot(FFPE.results, aes(y = tumorpercentage*100, x = ratio, color = classification, shape = eval)) +
# geom_point(size = 4, stroke = 2) +
# scale_color_manual(values = adjusted.color_panel) +
# scale_shape_manual(values = c(16,17,15)) +
# geom_vline(xintercept = 0.5, linetype = "dashed", color = "darkgrey") +
# annotate("text", x = 0.48, y = 100, label = "0.4", angle = 90, color = "darkgrey") +
# labs(y = "tumor percentage", x = "prediction ratio", title = "correlation between ratio, tumorpercentage and prediction") +
# theme_classic()
#
# FFPE.results$eval <- factor(FFPE.results$eval, levels = c("correct", "false subtype", "false"))
#
# ggplot(FFPE.results, aes(x=eval, y=tumorpercentage*100, color = classification, size = ratio, shape = collection, stroke = 2)) +
# geom_quasirandom() +
# scale_color_manual(values = adjusted.color_panel) +
# scale_shape_manual(values = c(16,17)) +
# scale_x_discrete(drop = FALSE) +
# scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
# ylim(0,100) +
# labs(x = NULL, y = "tumor percentage") +
# theme_classic()
#
# ggplot(FFPE.results, aes(x=eval, y=ratio, color = classification, shape = collection, size = 3)) +
# geom_quasirandom() +
# scale_color_manual(values = adjusted.color_panel) +
# scale_shape_manual(values = c(16,17)) +
# scale_x_discrete(drop = FALSE) +
# scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
# ylim(0,1) +
# labs(x = NULL, y = "prediction ratio") +
# theme_classic()
# # correlation between tumorpercentages of same sample with general or subclassification
#
# FFPE_higi.false_subtype <- FFPE_higi.results[(FFPE_higi.results$eval == "false subtype"),]
# FFPE_higi.subset <- FFPE_gen.results[which(FFPE_gen.results$ID %in% FFPE_higi.false_subtype$ID),]
# FFPE_BRCAc.false_subtype <- FFPE_BRCAc.results[(FFPE_BRCAc.results$eval == "false subtype"),]
# FFPE_BRCAc.subset <- FFPE_gen.results[which(FFPE_gen.results$ID %in% FFPE_BRCAc.false_subtype$ID),]
# FFPE_scc.false_subtype <- FFPE_scc.results[(FFPE_scc.results$eval == "false subtype"),]
# FFPE_scc.subset <- FFPE_gen.results[which(FFPE_gen.results$ID %in% FFPE_scc.false_subtype$ID),]
# FFPE.results <- rbind(FFPE.results, FFPE_higi.subset, FFPE_BRCAc.subset, FFPE_scc.subset)
# FFPE.results$eval <- factor(FFPE.results$eval, levels = c("correct", "false subtype", "false"))
#
# ggplot(FFPE.results, aes(x=eval, y=tumorpercentage*100, color = classification, shape = collection, size = 3)) +
# geom_line(aes(group = ID), colour = "darkgrey", size = 1) +
# geom_quasirandom() +
# scale_color_manual(values = adjusted.color_panel) +
# scale_shape_manual(values = c(16,17)) +
# scale_x_discrete(drop = FALSE) +
# scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
# ylim(0,100) +
# labs(x = NULL, y = "tumor percentage") +
# theme_classic()
ggplot(FFPE.results, aes(x=eval, y=ratio, color = classification, shape = collection, stroke = 2)) +
# geom_line(aes(group = ID), colour = "darkgrey", size = 1) +
facet_wrap(~step, scales = "free_x") +
geom_quasirandom(data = FFPE.results[FFPE.results$eval == "correct",], width = 0.5, size = 4) +
geom_quasirandom(data = FFPE.results[FFPE.results$eval == "false",], width = 0, size = 4) +
geom_quasirandom(data = FFPE.results[FFPE.results$eval == "correct subtype",], width = 0.3, size = 4) +
geom_quasirandom(data = FFPE.results[FFPE.results$eval == "false subtype",], width = 0.2, size = 4) +
scale_color_manual(values = adjusted.color_panel) +
scale_shape_manual(values = c(16,1)) +
scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
ylim(0,1) +
labs(x = NULL, y = "prediction ratio", title = "Figure 4: prediction ratio cut-off for accurate prediction") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "darkgrey") +
theme_classic()
ggplot(FFPE.results, aes(y = tumorpercentage*100, x = ratio, color = classification, shape = eval)) +
geom_point(size = 4, stroke = 2) +
scale_color_manual(values = adjusted.color_panel) +
scale_shape_manual(values = c(16,1,2,17), name = "evaluation") +
geom_vline(xintercept = 0.5, linetype = "dashed", color = "darkgrey") +
annotate("text", x = 0.48, y = 100, label = "0.5", angle = 90, color = "darkgrey") +
geom_hline(yintercept = 16, linetype = "dashed", color = "darkgrey") +
annotate("text", x = 1, y = 18, label = "16%", color = "darkgrey") +
labs(y = "tumor percentage", x = "prediction ratio", title = "Figure 5: correlation between prediction ratio, tumorpercentage and prediction accuracy") +
theme_classic()
A preliminary test of the cfRRBS-based classifier was performed on 6 plasma samples from non-CUP patients with known metastatic tumors: namely two melanoma, two lung adenocarcinoma and two pancreas tumor, which were all classified correctly. Next, the classifier was tested on prospectively collected blood samples from CUP patients, summarized in supplementary table 1. As published before by our lab, an accurate classification is highly dependent on the quality of the plasma sample as determined by fragment analysis; a low “cfDNA - high molecular weight DNA ratio” will negatively affect the diagnostic accuracy of cfDNA methylation profiling. Here, the plasma samples could be processed immediately after blood draw, resulting in good quality samples with a cfDNA ratio in the range of 81-99% (average 90%). In one sample, the overall DNA concentration was too low for cfDNA fraction estimation (COAD case). In 10 out of 13 samples, the origin of the ctDNA was correctly predicted, as shown in figure 6. The lowest DNA input amount for which a correct prediction could be made, was 0.4ng (the same COAD case).
# evaluate prediction
plasma_gen.results <- eval_results(plasma.samples, gen.results)
plasma_gen.results <- plasma_gen.results %>% mutate(collection = case_when(CUP == "yes" ~ "prospective",
CUP == "no" ~ "retrospective"))
ggplot(plasma_gen.results, aes(x=eval, y=tumorpercentage*100, color = classification, shape = CUP, stroke = 2)) + #size = ratio,
geom_quasirandom(data = plasma_gen.results[plasma_gen.results$eval == "false",], width = 0.4, size = 6) +
geom_quasirandom(data = plasma_gen.results[plasma_gen.results$eval == "correct",], width = 0.5, size = 6) +
scale_color_manual(values = color_panel_12G) +
scale_shape_manual(values = c(1,16)) +
# scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
ylim(0,100) +
labs(x = NULL, y = "tumor percentage", title="Figure 6: plasma classification results, 16/19 correct (10/13 CUPs)") +
theme_classic()
In cases where the plasma cfDNA was misclassified, a correct classification could be obtained from methylation profiling of the tissue samples with a higher prediction ratio (> 0.5) (figure 7). The opposite is also true; the three cases with a false prediction on tissue, had a correct prediction on plasma, as mentioned above (classification of FFPE samples).
# bind biopsies to plasma samples
biopsy_gen.results <- FFPE_gen.results[FFPE_gen.results$collection == "prospective",]
biopsy_gen.results$type <- "tissue"
plasma_subset <- plasma_gen.results[which(plasma_gen.results$ID %in% biopsy.samples_merged$ID),]
plasma_subset$type <- "plasma"
plasma.tissue_gen.results <- rbind(biopsy_gen.results,plasma_subset)
plasma.tissue_gen.results$type[plasma.tissue_gen.results$ID == "DNA058504"] <- "2nd tissue"
plasma.tissue_gen.results$type <- factor(plasma.tissue_gen.results$type, levels = c("plasma","tissue","2nd tissue"))
ggplot(plasma.tissue_gen.results, aes(x=type, y=tumorpercentage*100)) +
geom_point(aes(size = ratio, color = classification, shape = eval)) +
geom_line(aes(group = name)) +
facet_wrap(~ diagnosis) +
scale_color_manual(values = color_panel_12G) +
scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
scale_shape_discrete(name = "evaluation") +
labs(x = NULL, y = "tumor percentage", title = "Figure 7: classification results on CUP plasma samples with matched tissue biopsies from the same patient") +
theme_classic()
Six of the plasma samples (4 CUPs and 2 known metastatic samples) were classified as HIGI. Further subtyping could correctly predict a pancreatico-biliary origin in all samples (6/6). The BRCA plasma sample was correctly subclassified as ductal breast carcinoma (1/1). Only one of the three plasma samples was labeled as SCC in the general classifier. As discussed above, the misclassification of the other two SCC plasma samples was probably due to low ctDNA percentage. The remaining SCC plasma sample could not be further subtyped, as the suspected origin, pancreatic SCC, is a rare malignancy not present in our reference dataset (0/1) (supplementary figure 8).
# select plasma from higi results
plasma_higi.samples <- plasma_gen.results[(plasma_gen.results$classification == "HIGI")&(plasma_gen.results$eval == "correct"),]
plasma_higi.results <- higi.results[which(higi.results$ID %in% plasma_higi.samples$ID),]
# evaluate prediction
plasma_higi.results <- eval_results(plasma.samples, plasma_higi.results)
# select plasma from BRCAc results
plasma_BRCAc.samples <- plasma_gen.results[(plasma_gen.results$classification == "BRCAc")&(plasma_gen.results$eval == "correct"),]
plasma_BRCAc.results <- BRCAc.results[which(BRCAc.results$ID %in% plasma_BRCAc.samples$ID),]
# evaluate prediction
plasma_BRCAc.results <- eval_results(plasma.samples, plasma_BRCAc.results)
# combine results
plasma_sub.results <- rbind(plasma_higi.results,plasma_BRCAc.results)
plasma_sub.results <- plasma_sub.results %>% mutate(collection = case_when(CUP == "yes" ~ "prospective",
CUP == "no" ~ "retrospective"))
plasma_sub.results$eval <- factor(plasma_sub.results$eval, levels = c("correct","false"))
# subclassification results
ggplot(plasma_sub.results, aes(x=eval, y=tumorpercentage*100, color = classification, size = ratio, shape = collection)) +
geom_point(data = plasma_sub.results[plasma_sub.results$eval == "false",]) +
geom_quasirandom(data = plasma_sub.results[plasma_sub.results$eval == "correct",]) +
scale_color_manual(values = color_panel_sub) +
scale_shape_manual(values = c(16,1)) +
scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
scale_x_discrete(drop = FALSE) +
ylim(0,100) +
labs(x = NULL, y = "tumor percentage", title="subclassification results (BRCAc and HIGI): 7/7 plasma samples") +
theme_classic()
#ggplot(plasma.tissue_higi.results, aes(x=eval, y=tumorpercentage*100, color = classification, size = ratio, shape = CUP)) +
# facet_wrap(~ type) +
# geom_point(data = plasma.tissue_higi.results[plasma.tissue_higi.results$eval == "false",]) +
# geom_jitter(data = plasma.tissue_higi.results[plasma.tissue_higi.results$eval == "correct",], width = 0.2) +
# scale_color_manual(values = color_panel_HIGI) +
# scale_shape_manual(values = shape_panel) +
# scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
# scale_x_discrete(drop = FALSE) +
# ylim(0,100) +
# labs(x = NULL, y = "tumor percentage", title="HIGI specific classification: 6/6 plasma samples and 3/4 #tissue samples correct")
For plasma samples, there is a significant correlation between the tumor percentage as estimated by methylation profiling and by copy number profiling (r = 0.90, p < .001, supplementary figure 13). To quantify the tumor burden based on copy number, we used the copy number profile abnormality (CPA) score. The CPA score uses segmental Z-scores to quantify the deviation of segments from the normal diploid state.
cpa.plasma <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/CPA/plasma_combined_cpa.csv")
cpa.plasma[c("ID","lane")] <- str_split_fixed(cpa.plasma$file,'_',2)
cpa.plasma <- subset(cpa.plasma, select = -c(file,lane))
cpa.plasma <- merge(cpa.plasma, plasma_gen.results, by="ID")
ggplot(cpa.plasma, mapping = aes(x = cpa, y = tumorpercentage)) +
geom_point(size = 4) +
sm_statCorr() +
theme_classic()
Pleural and peritoneal effusions were collected from patients with metastatic cancer, both known and unknown, who received a diagnostic thoraco- or paracentesis to determine the cause of the effusion. It is therefore possible that the effusion is a consequence of comorbidities such as inflammation, heart failure, etc.
Diagnosis based on cytology was possible in 6 out of 23 pleural fluid samples (3/11 (clinical) CUPs and 3/12 known metastatic samples, supplementary figure 9).
# methatlas results
PE.samples <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/sample_lists/sampleList_PE-CUPs.csv", sep = ';')
PE.samples <- rename(PE.samples, ID = smapleID)
# evaluate prediction
PE_gen.results <- eval_results(PE.samples, gen.results)
# compare with cytology results
PE_gen.results$cyto <- ifelse(PE_gen.results$cytology == "+", "positive", "negative/inconclusive")
PE_gen.results$cyto <- factor(PE_gen.results$cyto, levels=c("positive", "negative/inconclusive"))
# ## change cytology results to general results to make an easy comparison with methylation profiling
# PE_gen.results$diagnosis_general <- PE_gen.results$diagnosis
# PE_gen.results$diagnosis_general <- ifelse((PE_gen.results$diagnosis == "CHOL") | (PE_gen.results$diagnosis == "GEJC") | (PE_gen.results$diagnosis == "STAD"), "HIGI", PE_gen.results$diagnosis)
# PE_gen.results$diagnosis_general <- ifelse((PE_gen.results$diagnosis == "BRCA") | (PE_gen.results$diagnosis == "BRCAl"), "BRCAc", PE_gen.results$diagnosis)
# PE_gen.results$diagnosis_general <- ifelse((PE_gen.results$diagnosis == "LUSC") | (PE_gen.results$diagnosis == "ESCC"), "SCC", PE_gen.results$diagnosis)
## visualize
ggplot(PE_gen.results, aes(x=cyto, y=tumorpercentage*100, color = diagnosis, shape = CUP)) +
geom_quasirandom(stroke = 2, size = 5) +
scale_color_manual(values = color_panel) +
scale_shape_manual(values = c(1,16)) +
ylim(0,100) +
labs(y = "tumor percentage", x = NULL, title = "cytology results: diagnosis in 6/23 cases") +
theme_classic()
Methylation profiling gave an accurate prediction in 14 of the cases (8/11 (clinical) CUPs and 6/12 known metastatic samples), as shown in figure 8. There was one clinical CUP case (LUAD) where cfDNA methylation profiling gave a false result, but cytology was correct. The opposite was more prevalent: in 9 cases cytology came up negative/inconclusive, but methylation profiling could provide an accurate diagnosis. A cytology sample with inconclusive result means atypical cells were seen, but these could not be further specified (insufficient material for IHC).
ggplot(PE_gen.results, aes(x=eval, y=tumorpercentage*100, color = classification, shape = CUP, stroke = 2)) + #size = ratio
geom_quasirandom(size = 6) +
scale_color_manual(values = color_panel_12G) +
scale_shape_manual(values = c(1,16)) +
# scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
ylim(0,100) +
labs(y = "tumor percentage", x = NULL, title = "Figure 8: PE classification results, 14/23 correct (8/11 CUPs)") +
theme_classic()
For samples that were assigned to a grouped entity (2 BRCAc and 2 SCC), subclassification was only successful in 1 of the 4 samples (supplementary figure 10). The lowest DNA input amount was 1.4ng; this sample was classified correctly (SCLC case).
# BRCAc specific classification
# select BRCAc results from gen.results
PE_BRCAc.samples <- PE_gen.results[(PE_gen.results$classification == "BRCAc")&(PE_gen.results$eval == "correct"),]
PE_BRCAc.results <- BRCAc.results[which(BRCAc.results$ID %in% PE_BRCAc.samples$ID),]
# evaluate prediction
PE_BRCAc.results <- eval_results(PE.samples, PE_BRCAc.results)
PE_BRCAc.results$eval <- factor(PE_BRCAc.results$eval, levels = c("correct","false"))
# select SCC results from gen.results
PE_SCC.samples <- PE_gen.results[(PE_gen.results$classification == "SCC")&(PE_gen.results$eval == "correct"),]
PE_SCC.results <- scc.results[which(scc.results$ID %in% PE_SCC.samples$ID),]
# evaluate prediction
PE_SCC.results <- eval_results(PE.samples, PE_SCC.results)
# combine results
PE_sub.results <- rbind(PE_BRCAc.results,PE_SCC.results)
# subclassification results
ggplot(PE_sub.results, aes(x=eval, y=tumorpercentage*100, color = classification, size = ratio)) +
geom_point() +
scale_color_manual(values = color_panel_sub) +
scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
scale_x_discrete(drop = FALSE) +
ylim(0,100) +
labs(y = "tumor percentage", x = NULL, title="Subclassification results (BRCAc and SCC): 1/4 PE samples") +
theme_classic()
Diagnosis based on cytology was possible in 7 out of 17 ascites samples (1/6 (clinical) CUPs and 6/11 known metastatic samples, supplementary figure 11).
asc.samples <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/sample_lists/sampleList_asc-CUPs.csv", sep = ';')
asc.samples <- rename(asc.samples, ID = smapleID)
# evaluate prediction
asc_gen.results <- eval_results(asc.samples, gen.results)
# compare with cytology results
asc_gen.results$cyto <- ifelse(asc_gen.results$cytology == "+", "positive", "negative/inconclusive")
ggplot(asc_gen.results, aes(x=cyto, y=tumorpercentage*100, color = diagnosis, shape = CUP)) +
geom_quasirandom(stroke = 2, size = 5) +
scale_color_manual(values = color_panel) +
scale_shape_manual(values = c(1,16)) +
ylim(0,100) +
labs(y = "tumor percentage", x = NULL, title="cytology results: diagnosis in 7/17 of cases") +
theme_classic()
Methylation profiling gave an accurate prediction in only 4 of the cases (1/6 CUPs and 3/11 known metastatic samples), as shown in figure 9. The 4 correct cases could also be diagnosed with cytology. Because a lot of samples were misclassified as mesothelioma, we tried deconvolution without the mesothelioma reference; this doubled the number of correct diagnoses (8/17). However, mesothelioma is an important differential in the diagnostic work-up of effusions, results without this reference are therefore not shown here.
ggplot(asc_gen.results, aes(x=eval, y=tumorpercentage*100, color = classification, shape = CUP, size = ratio)) +
geom_quasirandom(stroke = 2) +
scale_color_manual(values = color_panel_12G) +
scale_shape_manual(values = c(1,16)) +
scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
ylim(0,100) +
labs(y = "tumor percentage", x = NULL, title="Figure 9: ascites classification results, 4/17 correct (1/6 CUPs)") +
theme_classic()
One sample was correctly classified as BRCAc, but correct subtyping was again unsuccessful (supplementary figure 12).
# BRCAc specific classification
# select BRCAc results from gen.results
asc_BRCAc.samples <- asc_gen.results[(asc_gen.results$classification == "BRCAc")&(asc_gen.results$eval == "correct"),]
asc_BRCAc.results <- BRCAc.results[which(BRCAc.results$ID %in% asc_BRCAc.samples$ID),]
# evaluate prediction
asc_BRCAc.results <- eval_results(asc.samples, asc_BRCAc.results)
asc_BRCAc.results$eval <- factor(asc_BRCAc.results$eval, levels = c("correct","false"))
# BRCAc classification results
ggplot(asc_BRCAc.results, aes(x=eval, y=tumorpercentage*100, color = classification, size = ratio, shape = CUP)) +
geom_point(data = asc_BRCAc.results[asc_BRCAc.results$eval == "false",], stroke = 2) +
geom_point(data = asc_BRCAc.results[asc_BRCAc.results$eval == "correct",]) +
scale_color_manual(values = color_panel_BRCAc) +
scale_shape_manual(values = 1) +
scale_size_continuous(range = c(3,8), breaks = seq(0,1, by = 0.2)) +
scale_x_discrete(drop = FALSE) +
ylim(0,100) +
labs(y = "tumor percentage", x = NULL, title="Subclassification results (BRCAc): 0/1 ascites sample") +
theme_classic()
To quantify the tumor burden based on copy number, we used the copy number profile abnormality (CPA) score. The CPA score uses segmental Z-scores to quantify the deviation of segments from the normal diploid state.31 For the effusion samples there seems to be no correlation (r = 0.31, p = .051, supplementary figure 14).
# load cpa results from effusion samples
cpa.effusions <- read.csv("/Users/jilke/Scripts/RRBSonCUPs/vsc42927/CPA/effusions_combined_cpa.csv")
cpa.effusions[c("ID","lane")] <- str_split_fixed(cpa.effusions$file,'_',2)
cpa.effusions <- subset(cpa.effusions, select = -c(file,lane))
# merge cpa results with T% results
asc_gen.results$type <- "ascites"
PE_gen.results$type <- "pleural effusion"
asc.PE_gen.results <- rbind(asc_gen.results, PE_gen.results)
cpa.effusions_no.cutoff <- merge(cpa.effusions, asc.PE_gen.results, by="ID")
# plot
ggplot(cpa.effusions_no.cutoff, mapping = aes(x = cpa, y = tumorpercentage)) +
geom_point(size = 4) +
sm_statCorr() +
theme_classic()
Of the 27 effusions samples with negative/inconclusive cytology, 9 could be diagnosed based on cfDNA methylation profiling (33%). On the other hand, there were 13 samples with positive cytology, 4 of which could not be diagnosed based on cfDNA methylation profiling (31%).
# load fragment analyzer results
fa <- read.csv('/Users/jilke/Scripts/RRBSonCUPs/projects/2022-06_final-scripts_paper/FA-Tpct.csv')
fa_gen.results <- merge(asc.PE_gen.results, fa)
# plot
ggplot(fa_gen.results, aes(y = tumorpercentage*100, x = eval, color = classification, shape = type, size = ratio)) +
facet_wrap(~cyto) +
geom_quasirandom(stroke = 2, width = 0.4) +
scale_color_manual(values = color_panel_12G) +
scale_shape_manual(values = c(16,1)) +
labs(y = "tumor percentage", x = NULL, title = "classification results in positive compared with negative/inconclusive cytology samples") +
theme_classic()
Overview of the cfDNA percentage, tumor percentage and classification results of all pleural and peritoneal effusion samples.
fa_gen.results$type_eval <- paste(fa_gen.results$type, fa_gen.results$eval, sep = " ")
ggplot(fa_gen.results, aes(y = tumorpercentage*100, x = cfDNApct, color = classification, shape = type_eval, size = ratio)) +
geom_point(stroke = 3) +
scale_color_manual(values = color_panel_12G) +
scale_shape_manual(values = c(16,17,1,2), name = "type and evaluation") +
labs(x = "cfDNA percentage", y = "tumor percentage") +
geom_vline(xintercept = 40, linetype = "dashed", color = "darkgrey") +
annotate("text", x = 38, y = 100, label = "40%", angle = 90, color = "darkgrey") +
geom_hline(yintercept = 7, linetype = "dashed", color = "darkgrey") +
annotate("text", x = 100, y = 9, label = "7%", color = "darkgrey") +
theme_classic()
Contrary to the plasma samples, the “cfDNA - high molecular weight DNA” ratio showed great variance in the peritoneal and pleural effusion samples, ranging from 1% to 100% (all fragment analysis profiles in supplementary table 5). Figure 11 shows the importance of this preanalytical factor, with classification results of all cfDNA samples (both plasma and effusions) with a cfDNA percentage below or above 40% as determined by Tapestation (Agilent). Correct prediction is highly unlikely if the cfDNA percentage is lower than 40%; only 4 samples out of 21 are classified correctly.
plasma_fa.results <- merge(plasma_gen.results, fa)
plasma_fa.results <- subset(plasma_fa.results, select = -collection)
plasma_fa.results$type <- "plasma"
fa_gen.results.subset <- subset(fa_gen.results, select = -c(type_eval,cyto,cytology))
cfDNA_gen.results <- rbind(fa_gen.results.subset, plasma_fa.results)
cfDNA_gen.results$type <- ifelse((cfDNA_gen.results$type == "pleural effusion") | (cfDNA_gen.results$type == "ascites"), "effusion", cfDNA_gen.results$type)
cfDNA_gen.results$type_eval <- paste(cfDNA_gen.results$type, cfDNA_gen.results$eval, sep = " ")
cfDNA_gen.results$QC <- ifelse(cfDNA_gen.results$cfDNApct >= 40, "cfDNA fraction >= 40%", "cfDNA fraction < 40%")
ggplot(cfDNA_gen.results, aes(y = tumorpercentage*100, x = eval, color = classification, shape = type, stroke = 2)) + #, size = ratio
facet_wrap(~QC) +
geom_quasirandom(width = 0.5, size = 6) +
scale_color_manual(values = color_panel_12G) +
scale_shape_manual(values = c(16,1)) +
labs(y = "tumor percentage", x = NULL, title = "Figure 11: classification results in cfDNA samples with cfDNA% QC cut-off") +
theme_classic()
In effusion samples with a cfDNA percentage above 40%, there is a significant correlation with the CPA score as well (r = 0.63, p = .004). (supplementary figure 15)
QC_cutoff.results <- cfDNA_gen.results[cfDNA_gen.results$cfDNApct >= 40,]
cpa.effusions_cutoff <- merge(cpa.effusions, QC_cutoff.results, by="ID")
ggplot(cpa.effusions_cutoff, mapping = aes(x = cpa, y = tumorpercentage)) +
geom_point(size = 4) +
sm_statCorr() +
labs(title = "Correlation between predicted tumor percentage and CPA score in effusion samples with a cfDNA fraction >= 40%") +
theme_classic()
Here, a logistic regression model fitted to the prediction ratio has an AUC of 0.777 (supplementary figure 16). However, it is worth mentioning that the sole misclassified sample with a predicted tumor percentage above 7% (45%), also has a prediction ratio less than 0.5. Again, it concerns a LUAD case (classified as SCLC) and again, LUAD is the second highest predicted tumor fraction.
# generate a ROC curve for the prediction ratio
QC_cutoff.results$ROC_label <- ifelse(QC_cutoff.results$eval == "false", 0, 1)
glm.fit <- glm(ROC_label ~ ratio, data = QC_cutoff.results, family = binomial)
xweight <- seq(0, 10, 0.01)
yweight <- predict(glm.fit, list(ratio = xweight), type="response")
plot(x=QC_cutoff.results$ratio, y=QC_cutoff.results$ROC_label)
lines(xweight, yweight)
# make the ROC plot square
par(pty = "s")
# ROC plot
roc(QC_cutoff.results$ROC_label, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, print.auc=TRUE, main = "Supplementary figure 16: ROC curve of the prediction ratio in good quality liquid biopsy samples")
##
## Call:
## roc.default(response = QC_cutoff.results$ROC_label, predictor = glm.fit$fitted.values, plot = TRUE, legacy.axes = TRUE, print.auc = TRUE, main = "Supplementary figure 16: ROC curve of the prediction ratio in good quality liquid biopsy samples")
##
## Data: glm.fit$fitted.values in 8 controls (QC_cutoff.results$ROC_label 0) < 23 cases (QC_cutoff.results$ROC_label 1).
## Area under the curve: 0.7772
In samples that have a cfDNA percentage above 40% (n=31), misclassified samples have a significantly lower estimated tumor percentage (median 3.6%) than correctly classified samples (median 30.4%) (Mann-Whitney U test; z = -3.85; p < .001). A correct prediction is not possible if the tumor percentage is below 7% (n = 7). In samples with a predicted tumor percentage above 7%, the classifier has an accuracy of 96% (23/24 cfDNA samples, 14/15 peritoneal/pleural effusions, and 13/13 plasma samples). Contrary to FFPE samples, the influence of the prediction ratio is less significant (figure 12).
ggplot(QC_cutoff.results, aes(y = tumorpercentage*100, x = ratio, color = classification, shape = type_eval)) +
geom_point(stroke = 2, size = 5) +
scale_color_manual(values = color_panel_12G) +
scale_shape_manual(values = c(16,17,1,2), name = "type and evaluation") +
labs(x = "prediction ratio", y = "tumor percentage", title = "Figure 12: general classification results of all cfDNA samples with a cfDNA% QC cut-off above 40%") +
geom_hline(yintercept = 7, linetype = "dashed", color = "darkgrey") +
annotate("text", x = 1, y = 9, label = "7%", color = "darkgrey") +
geom_vline(xintercept = 0.5, linetype = "dashed", color = "darkgrey") +
annotate("text", x = 0.48, y = 100, label = "0.5", color = "darkgrey", angle = 90) +
theme_classic()
#nrow(cfDNA_gen.results[(cfDNA_gen.results$cfDNA > 40)&(cfDNA_gen.results$eval == "correct"),])
#nrow(cfDNA_gen.results[(cfDNA_gen.results$cfDNA < 40),])
#nrow(QC_cutoff.results[(QC_cutoff.results$tumorpercentage > 0.07),])